In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_validate, GridSearchCV
from sklearn.metrics import make_scorer, roc_auc_score, cohen_kappa_score, precision_score, recall_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from lime.lime_tabular import LimeTabularExplainer
from tqdm import tqdm
import warnings
import random
In [2]:
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore", message=".*X does not have valid feature names*")
Data preparation¶
In [3]:
# Load the conceptual-feature dataset (one row per student reflection).
# NOTE(review): relative path — assumes the kernel's working directory is the
# notebook's own folder; confirm before running elsewhere.
data = pd.read_csv('../conceptual_features.csv')
# Keep the raw texts (with their ids) aside: used later to print the
# reflection alongside each LIME explanation.
corpus = data[['text.id','text']]
In [4]:
# Recode outcome vars
# Recode outcome vars into a single 4-class `attribution` target.
# internal locus AND explicitly uncontrollable (not also coded controllable)
data['internal_uncontrollable'] = np.where((data['locus.internal']==1) &
                                           (data['controllability.uncontrollable']==1) &
                                           (data['controllability.controllable']==0), 1, 0)
# purely external locus (no internal component at all)
data['no.internal'] = np.where((data['locus.external']==1) & (data['locus.internal']==0), 1, 0)
# codebook: 1-no.attribution, 2-no.internal, 3-internal.uncontrollable
# Vectorized column arithmetic — identical to the previous row-wise
# `data.apply(..., axis=1)` but runs in one pass over the columns.
data['attribution'] = (data['no.attribution']
                       + data['no.internal'] * 2
                       + data['internal_uncontrollable'] * 3)
# codebook (cont.): 4-internal.controllable — everything not matched above
data['attribution'] = data['attribution'].replace(0, 4)
# Drop the helper indicator columns now that `attribution` is built.
data = data.drop(columns=['internal_uncontrollable', 'no.internal'])
data['attribution'].value_counts()
Out[4]:
attribution 4 688 1 253 2 93 3 46 Name: count, dtype: int64
In [5]:
# Dropping non-feature/target columns in one pass: record identifiers and
# metadata, coarse-grained attribution codes, and raw text length.
non_feature_columns = (
    ['text.id','student.id','exam.failed','exam.date','year.of.exam','text','submit.date','year.of.submission']
    + ['exam.preparation','exam.strategies','exam.mistakes','knowledge.gap']
    + ['text.len']
)
data = data.drop(columns=non_feature_columns)
# One-hot label columns: kept for describing test instances, excluded from X
onehot = ['no.attribution',
          'locus.external','locus.internal',
          'stability.unstable','stability.stable',
          'controllability.uncontrollable','controllability.controllable']
# Targets of interest
targets = ['attribution']
# Dimensionality of the feature space
data.drop(columns=targets+onehot).info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1080 entries, 0 to 1079 Data columns (total 39 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 focuspast 1080 non-null float64 1 focuspresent 1080 non-null float64 2 focusfuture 1080 non-null float64 3 correctness.mistakes 1080 non-null int64 4 mastery 1080 non-null int64 5 KC.experience 1080 non-null int64 6 KC.knowledge 1080 non-null int64 7 KC.skill 1080 non-null int64 8 clinical.concept 1080 non-null int64 9 concept.hygiene 1080 non-null int64 10 concept.anatomy 1080 non-null int64 11 concept.pathology 1080 non-null int64 12 concept.patientCare 1080 non-null int64 13 concept.treatment 1080 non-null int64 14 concept.prosthodontics 1080 non-null int64 15 clinical.procedure 1080 non-null int64 16 effort 1080 non-null int64 17 studying.materials 1080 non-null int64 18 strategies 1080 non-null int64 19 challenge 1080 non-null int64 20 exam.delivery 1080 non-null int64 21 person 1080 non-null int64 22 neg_VERB 1080 non-null int64 23 pos_VERB 1080 non-null int64 24 neg_ADV 1080 non-null int64 25 pos_ADV 1080 non-null int64 26 neg_ADJ 1080 non-null int64 27 pos_ADJ 1080 non-null int64 28 neg_words 1080 non-null int64 29 pos_words 1080 non-null int64 30 overall_sentiment 1080 non-null float64 31 percent.neg_VERB 1080 non-null float64 32 percent.pos_VERB 1080 non-null float64 33 percent.neg_ADV 1080 non-null float64 34 percent.pos_ADV 1080 non-null float64 35 percent.neg_ADJ 1080 non-null float64 36 percent.pos_ADJ 1080 non-null float64 37 percent.neg_words 1080 non-null float64 38 percent.pos_words 1080 non-null float64 dtypes: float64(12), int64(27) memory usage: 329.2 KB
In [6]:
random_state = 6052  # single seed reused for split, CV folds, forests, and sampling
# Splitting hold out test set
X = data.drop(columns=targets+onehot)  # numeric feature matrix
y = data[targets+onehot]               # target plus one-hot label columns
# 10% hold-out; the remaining 90% feeds the (nested) cross-validation below
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=random_state, shuffle=True)
In [7]:
# Randomly pick one test-set instance per attribution profile for LIME-based
# model interpretation. Each profile is a query over the one-hot label columns:
# (query, fine-grained class name, compacted class name).
profiles = [
    ('`no.attribution`==1',
     'No attribution', 'No attribution'),
    ('(`locus.external`==0) & (`locus.internal`==1) & (`stability.unstable`==1) & (`stability.stable`==0) & (`controllability.uncontrollable`==0) & (`controllability.controllable`==1)',
     'Internal - Unstable - Controllable', 'Internal-Controllable'),
    ('(`locus.external`==1) & (`locus.internal`==0) & (`stability.unstable`==1) & (`stability.stable`==0) & (`controllability.uncontrollable`==0) & (`controllability.controllable`==1)',
     'External - Unstable - Controllable', 'External'),
    ('(`locus.external`==1) & (`locus.internal`==0) & (`stability.unstable`==0) & (`stability.stable`==1) & (`controllability.uncontrollable`==0) & (`controllability.controllable`==1)',
     'External - Stable - Controllable', 'External'),
    ('(`locus.external`==1) & (`locus.internal`==1) & (`stability.unstable`==1) & (`stability.stable`==1) & (`controllability.uncontrollable`==0) & (`controllability.controllable`==1)',
     'Int/Ext - Unstbl/Stbl - Controllable', 'Internal-Controllable'),
    ('(`locus.external`==0) & (`locus.internal`==1) & (`stability.unstable`==0) & (`stability.stable`==1) & (`controllability.uncontrollable`==1) & (`controllability.controllable`==0)',
     'Internal - Stable - Uncontrollable', 'Internal-Uncontrollable'),
]
random.seed(random_state)
# Build the frame in one shot (avoids growing a DataFrame row by row).
# random.choice is called once per profile, in the same order as before,
# so the sampled instances are identical for a given seed.
instances = pd.DataFrame(
    [{'index': random.choice(y_test.query(query).index),
      'class_fineGrained': fine_name,
      'class_compacted': compact_name}
     for query, fine_name, compact_name in profiles],
    columns=['index', 'class_fineGrained', 'class_compacted'])
instances
Out[7]:
| index | class_fineGrained | class_compacted | |
|---|---|---|---|
| 0 | 8 | No attribution | No attribution |
| 1 | 581 | Internal - Unstable - Controllable | Internal-Controllable |
| 2 | 959 | External - Unstable - Controllable | External |
| 3 | 556 | External - Stable - Controllable | External |
| 4 | 352 | Int/Ext - Unstbl/Stbl - Controllable | Internal-Controllable |
| 5 | 686 | Internal - Stable - Uncontrollable | Internal-Uncontrollable |
Model Training and Evaluation¶
Use nested 5-fold CV to estimate performance of the entire procedure¶
In [8]:
# Model training configs
forest = RandomForestClassifier(random_state=random_state)
# max_features rationale: sqrt(39) ≈ 6 ≈ 0.15*39 | (39/3 = 13) <=> (12 ≈ 0.3*39)
param_grid = {
    'class_weight': ['balanced', 'balanced_subsample'],
    'n_estimators': [200, 400, 600, 800, 1000],
    'max_features': [0.15, 0.30, 0.45, 0.60, 0.75],
}
# Metrics reported for the outer CV loop (multi-class, macro-averaged)
scoring = {
    'auc': 'roc_auc_ovo',
    'kappa': make_scorer(cohen_kappa_score),
    'precision': 'precision_macro',
    'recall': 'recall_macro',
}
# One results row per target: mean and std of every outer-fold metric
metric_columns = [f'{name}_{stat}' for name in scoring for stat in ('mean', 'std')]
results = pd.DataFrame(columns=['target'] + metric_columns)
In [9]:
# Model training and eval for each target var
for target in tqdm(targets):
# CV configs for the inner and outer loops
inner_cv = KFold(n_splits=5, shuffle=True, random_state=random_state)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=random_state)
# Nested CV with hyperparameter tuning
model = GridSearchCV(estimator=forest, param_grid=param_grid, scoring='roc_auc_ovo', cv=inner_cv)
scores = cross_validate(model, X=X_train, y=y_train[target], scoring=scoring, cv=outer_cv)
results.loc[len(results)] = [target,
np.mean(scores['test_auc']), np.std(scores['test_auc']),
np.mean(scores['test_kappa']), np.std(scores['test_kappa']),
np.mean(scores['test_precision']), np.std(scores['test_precision']),
np.mean(scores['test_recall']), np.std(scores['test_recall'])]
results
100%|████████████████████████████████████████████| 1/1 [12:45<00:00, 765.77s/it]
Out[9]:
| target | auc_mean | auc_std | kappa_mean | kappa_std | precision_mean | precision_std | recall_mean | recall_std | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | attribution | 0.751056 | 0.015974 | 0.35132 | 0.072346 | 0.471656 | 0.041525 | 0.422468 | 0.025549 |
Evaluating and Interpreting Model on Hold Out Test Set¶
Utilities¶
In [10]:
def label(target):
    """Return the ordered class display names for `target`.

    Every target currently uses the same 4-class attribution scheme
    (codes 1-4 from the codebook), so the argument is accepted for
    interface symmetry only and does not affect the result.
    """
    class_names = ['no attribution', 'external', 'int-unctrl', 'int-ctrl']
    return class_names
def explain(instances, model, target):
    """Print a LIME explanation for each sampled test instance.

    Parameters
    ----------
    instances : pd.DataFrame
        Must have columns 'index' (row label into X_test / corpus) and
        'class_compacted' (human-readable true label).
    model : fitted classifier exposing predict_proba (e.g. the refitted
        GridSearchCV from the evaluation loop).
    target : str
        Target column name in y_train; selects the training labels and
        class names passed to the explainer.

    Relies on notebook globals X_train, X_test, y_train, and corpus.
    NOTE(review): assumes `instances['index']` values are valid labels in
    both X_test and corpus — holds because they were sampled from y_test.
    """
    # LIME perturbs samples drawn from the training-data distribution
    explainer = LimeTabularExplainer(training_data=np.array(X_train),
                                     feature_names=X_train.columns,
                                     mode='classification',
                                     training_labels=y_train[target].values,
                                     class_names=label(target))
    for instance in instances.itertuples():
        # `instance.index` is the 'index' COLUMN (the original row label),
        # not the positional DataFrame index (that one is `instance.Index`).
        print(f"\n🤖: Explaining the prediction for instance {instance.index}\n")
        print(f"True label: {instance.class_compacted}")
        print(f"\nReflection: \n{corpus['text'].loc[instance.index]}\n")
        # top_labels=4: explain all four classes; show 15 strongest features
        explanation = explainer.explain_instance(X_test.loc[instance.index].values, model.predict_proba, num_features=15, top_labels=4)
        explanation.show_in_notebook(show_table=True)
Use 5-fold CV to select the best model + Performance on hold out test data¶
In [11]:
# Select the best model via 5-fold CV on the training set, then evaluate the
# refitted winner once on the hold-out test set and explain it with LIME.
holdout_eval = pd.DataFrame(columns=['target']+list(scoring.keys()))
for target in targets:
    print(f"\n===== {target} =====\n")
    labels = label(target)
    # Use 5-fold CV to select the best model
    cv = KFold(n_splits=5, shuffle=True, random_state=random_state)
    model = GridSearchCV(estimator=forest, param_grid=param_grid, scoring='roc_auc_ovo', refit=True, cv=cv)
    model.fit(X_train, y_train[target])
    print(f"Best hyperparameters: \n{model.best_params_}\n")
    print(f"Mean cross-validated auc score of the best_estimator on training set: \n{model.best_score_}\n")
    # Evaluate best model (refitted on the training data) on hold out test set
    print("Performance on hold out test set:")
    # Predict once and reuse — the previous version recomputed
    # model.predict(X_test) for every metric below (4 redundant passes).
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)
    # > Confusion matrix
    cm = confusion_matrix(y_test[target], y_pred)
    ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels).plot()
    plt.show()
    # > Classification report
    print(classification_report(y_test[target], y_pred, target_names=labels))
    # > Performance metrics (macro-averaged, one-vs-one AUC for multi-class)
    holdout_eval.loc[len(holdout_eval)] = [target,
                                           roc_auc_score(y_test[target], y_proba, average='macro', multi_class='ovo'),
                                           cohen_kappa_score(y_test[target], y_pred),
                                           precision_score(y_test[target], y_pred, average='macro'),
                                           recall_score(y_test[target], y_pred, average='macro')]
    # Explain predictions using LIME
    explain(instances, model, target)
===== attribution =====
Best hyperparameters:
{'class_weight': 'balanced', 'max_features': 0.75, 'n_estimators': 400}
Mean cross-validated auc score of the best_estimator on training set:
0.7566063538388739
Performance on hold out test set:
precision recall f1-score support
no attribution 0.55 0.50 0.52 22
external 0.40 0.22 0.29 9
int-unctrl 0.43 0.33 0.38 9
int-ctrl 0.75 0.84 0.79 68
accuracy 0.68 108
macro avg 0.53 0.47 0.49 108
weighted avg 0.65 0.68 0.66 108
🤖: Explaining the prediction for instance 8
True label: No attribution
Reflection:
I need to know more about RPD designs as well as which are stress bearing areas and which parts serve as direct and indirect retainers.
🤖: Explaining the prediction for instance 581 True label: Internal-Controllable Reflection: I needed further revie in Oral and Maxillofacial Pathology and Radiology.
🤖: Explaining the prediction for instance 959 True label: External Reflection: I should have passed but there was a discrepancy where the faculty thought I placed my hand too low on the thyroid cartilage.
🤖: Explaining the prediction for instance 556 True label: External Reflection: I believe this was the admissions clinic extra-oral and intra-oral examination. I failed it the first time I took it because the faculty member I was working with did not like the way I palpated the lymph nodes in the neck and submandibular region.
🤖: Explaining the prediction for instance 352 True label: Internal-Controllable Reflection: I was not sure about the information asked on the exam.
🤖: Explaining the prediction for instance 686 True label: Internal-Uncontrollable Reflection: I was unable to manage time properly which is why I was unable to finish the exam. I spent too much time in answering few questions in the beginning which is why I was unable to attempt questions in the end. Also while reviewing the exam I realized that how important the proper use of terminology is in describing a lesion.
In [12]:
# Hold-out test-set performance summary (one row per target)
holdout_eval
Out[12]:
| target | auc | kappa | precision | recall | |
|---|---|---|---|---|---|
| 0 | attribution | 0.816309 | 0.364492 | 0.532143 | 0.473448 |
In [ ]: